Sentinel-2 Cloud Statistics¶

In [1]:
# Load Data Cube Configuration
# from odc_gee import earthengine
# dc = earthengine.Datacube(app='Cloud_Statistics')

# Import Data Cube API
# import utils.data_cube_utilities.data_access_api as dc_api  
# api = dc_api.DataAccessApi()

# Import Data Cube Utilities
# Import Data Cube Utilities
import datacube
import sys, os
os.environ['USE_PYGEOS'] = '0'  # opt out of PyGEOS (presumably to silence geopandas deprecation warnings) — must be set before geopandas is imported
from dea_tools.plotting import rgb, display_map
from dea_tools.bandindices import calculate_indices

### EASI tools
# Make the shared ../scripts helpers importable from this notebook
sys.path.append(os.path.expanduser('../scripts'))
from ceos_utils.data_cube_utilities.clean_mask import landsat_clean_mask_invalid, landsat_qa_clean_mask
from easi_tools import EasiDefaults
from easi_tools import notebook_utils
easi = EasiDefaults() # Get the default parameters for this system

# Import Common Utilities
import numpy as np
import xarray as xr
import pandas as pd
import matplotlib.pyplot as plt
Successfully found configuration for deployment "eail"
In [2]:
# Start a local Dask cluster (no Dask Gateway) for parallel, lazy data loading
cluster, client = notebook_utils.initialize_dask(use_gateway=False)
display(cluster if cluster else client)
# Print a dashboard URL proxied through the hub so it is reachable from the browser
print(notebook_utils.localcluster_dashboard(client, server=easi.hub))

LocalCluster

4956af27

Dashboard: http://127.0.0.1:8787/status Workers: 4
Total threads: 8 Total memory: 29.00 GiB
Status: running Using processes: True

Scheduler Info

Scheduler

Scheduler-0149332c-8778-4872-a0af-449a4a037d32

Comm: tcp://127.0.0.1:45479 Workers: 4
Dashboard: http://127.0.0.1:8787/status Total threads: 8
Started: Just now Total memory: 29.00 GiB

Workers

Worker: 0

Comm: tcp://127.0.0.1:45249 Total threads: 2
Dashboard: http://127.0.0.1:34399/status Memory: 7.25 GiB
Nanny: tcp://127.0.0.1:33505
Local directory: /tmp/dask-worker-space/worker-naiqewr5

Worker: 1

Comm: tcp://127.0.0.1:41895 Total threads: 2
Dashboard: http://127.0.0.1:39371/status Memory: 7.25 GiB
Nanny: tcp://127.0.0.1:45281
Local directory: /tmp/dask-worker-space/worker-dhhu6waq

Worker: 2

Comm: tcp://127.0.0.1:33453 Total threads: 2
Dashboard: http://127.0.0.1:32949/status Memory: 7.25 GiB
Nanny: tcp://127.0.0.1:33327
Local directory: /tmp/dask-worker-space/worker-rk1476l0

Worker: 3

Comm: tcp://127.0.0.1:34571 Total threads: 2
Dashboard: http://127.0.0.1:36113/status Memory: 7.25 GiB
Nanny: tcp://127.0.0.1:44493
Local directory: /tmp/dask-worker-space/worker-15vbnx45
https://hub.eail.easi-eo.solutions/user/jhodge/proxy/8787/status
In [3]:
# Configure AWS S3 credentials on the Dask client so workers can read the
# imagery; requester_pays=True means this account is billed for data transfer.
from datacube.utils.aws import configure_s3_access
configure_s3_access(aws_unsigned=False, requester_pays=True, client=client)
Out[3]:
<botocore.credentials.Credentials at 0x7f5463d94790>
In [4]:
# Select a Product
product = "s2_l2a"  # Sentinel-2 Level-2A (surface reflectance with SCL band)
In [5]:
# Select a Latitude-Longitude point for the center of the analysis region
# Select the size of the box (in degrees) surrounding the center point

# Mombasa, Kenya
# lat_long = (-4.03, 39.62)
# box_size_deg = 0.15

# Calculate the latitude and longitude bounds of the analysis box
# latitude = (lat_long[0]-box_size_deg/2, lat_long[0]+box_size_deg/2)
# longitude = (lat_long[1]-box_size_deg/2, lat_long[1]+box_size_deg/2)

# Sydney Cricket Ground
# latitude = (-33.8951, -33.8902)
# longitude = (151.2219, 151.2276)

# Sydney, Australia
# latitude = (-34.039, -33.668)
# longitude = (150.867, 151.350)

# Suva, Fiji
# latitude = (-18.1725, -18.0492)
# longitude = (178.3881, 178.5190)

# An Giang Province - Vietnam
# Test Region for EY Data Challenge
# SMALL RICE CROP AREA #23
# lat_long = (10.404, 105.236)
# box_size_deg = 0.005
# Calculate the latitude and longitude bounds of the analysis box
# latitude = (lat_long[0]-box_size_deg/2, lat_long[0]+box_size_deg/2)
# longitude = (lat_long[1]-box_size_deg/2, lat_long[1]+box_size_deg/2)

# Use the deployment's default region (EasiDefaults); each is a (min, max) tuple in degrees
latitude = easi.latitude
longitude = easi.longitude
In [6]:
# Select a time range
# The inputs require a format (Min,Max) using this date format (YYYY-MM-DD)
# The Sentinel-2 allowable time range is: 2017-03-28 to current
time_extents = ('2018-01-01', '2018-12-31')  # one full calendar year
In [7]:
# Display the analysis region
# Click on the plot to get Lat-Lon coordinates to adjust the region
# Zoom in/out on the plot to move around the globe for other regions

display_map(longitude,latitude)  # note argument order: longitude first, then latitude
/env/lib/python3.10/site-packages/dea_tools/plotting.py:313: FutureWarning: This function is deprecated. See: https://pyproj4.github.io/pyproj/stable/gotchas.html#upgrading-to-pyproj-2-from-pyproj-1
  all_longitude, all_latitude = transform(Proj(crs), Proj('EPSG:4326'), all_x,
Out[7]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Calculate cloud coverage percentage for each pixel¶

In [8]:
# Create a custom cloud coverage table here

def build_cloud_coverage_table_sentinel(product, platform, latitude, longitude,
                                        time=None, dc=None, extra_band='green',
                                        extra_load_params=None):
    """Load a Sentinel-2 cube and summarize per-scene cloud statistics.

    Classifies every pixel from the Scene Classification Layer (SCL):
    no-data (SCL 0), cloud-affected (SCL 1, 3, 8, 9, 10), or clean
    (everything else), then tabulates per-date percentages and counts.

    Parameters
    ----------
    product : str
        Datacube product name (e.g. 's2_l2a').
    platform : str
        Unused; kept for interface compatibility with existing callers.
    latitude, longitude : tuple
        (min, max) extents of the analysis region in degrees.
    time : tuple of str, optional
        ('YYYY-MM-DD', 'YYYY-MM-DD') range; omitted from the load if None.
    dc : datacube.Datacube
        Datacube instance used for loading (required).
    extra_band : str
        Extra measurement to load alongside 'SCL' (default 'green').
    extra_load_params : dict, optional
        Additional keyword arguments forwarded to ``dc.load()``
        (e.g. output_crs, resolution, dask_chunks).

    Returns
    -------
    (dataset, pandas.DataFrame)
        The loaded dataset and a table with one row per acquisition date:
        Clean/Cloud/NoData percentages, clean pixel count, and the count
        of valid (non-nodata) pixels.
    """
    # None default instead of a shared mutable {} default (classic Python pitfall).
    if extra_load_params is None:
        extra_load_params = {}

    load_params = dict(product=product, latitude=latitude, longitude=longitude,
                       group_by='solar_day', measurements=[extra_band, 'SCL'],
                       **extra_load_params)
    if time is not None:
        load_params["time"] = time

    geo_data = dc.load(**load_params)

    # One entry per acquisition, truncated to day precision for the table.
    dates = [dt.astype('datetime64[D]') for dt in geo_data.time.values]

    scl = geo_data.SCL.values  # shape (time, y, x)

    nodata_mask = (scl == 0)
    # SCL classes counted as cloud-affected: 1 (saturated/defective),
    # 3 (cloud shadow), 8/9 (cloud medium/high probability), 10 (thin cirrus).
    cloud_mask = np.isin(scl, (1, 3, 8, 9, 10))
    clean_mask = ~nodata_mask & ~cloud_mask

    # Iterating a 3-D mask yields one 2-D mask per time slice.
    clean_percent = [m.mean() * 100 for m in clean_mask]
    cloud_percent = [m.mean() * 100 for m in cloud_mask]
    nodata_percent = [m.mean() * 100 for m in nodata_mask]

    clean_count = list(map(np.sum, clean_mask))
    total_count = list(map(np.sum, ~nodata_mask))  # valid (non-nodata) pixels per scene

    data = {"Date": dates,"Clean_percent": clean_percent,"Cloud_percent": cloud_percent,
            "NoData_percent": nodata_percent,"Clean_count": clean_count,"Total_count": total_count}

    return geo_data, pd.DataFrame(data=data, columns=list(data.keys()))
In [9]:
dc = datacube.Datacube()
In [10]:
# Load the data and calculate the cloud coverage for each time slice
# Output grid: EPSG:6933 at 10 m pixels (negative y resolution = north-up);
# dask_chunks makes the load lazy with one chunk per time slice.
sentinel_dataset, coverage_table = build_cloud_coverage_table_sentinel(product = product,
                                                                       platform = "SENTINEL-2",
                                                                       latitude = latitude,
                                                                       longitude = longitude,
                                                                       time = time_extents,
                                                                       dc = dc,
                                                                       extra_band = 'green',
                                                                       extra_load_params={
                                                                         'output_crs':'EPSG:6933',
                                                                         'resolution': (-10,10),
                                                                         'skip_broken_datasets': True,
                                                                         'dask_chunks': {'time':1}
                                                                       })
/env/lib/python3.10/site-packages/rasterio/warp.py:344: NotGeoreferencedWarning: Dataset has no geotransform, gcps, or rpcs. The identity matrix will be returned.
  _reproject(
/env/lib/python3.10/site-packages/rasterio/warp.py:344: NotGeoreferencedWarning: Dataset has no geotransform, gcps, or rpcs. The identity matrix will be returned.
  _reproject(
/env/lib/python3.10/site-packages/rasterio/warp.py:344: NotGeoreferencedWarning: Dataset has no geotransform, gcps, or rpcs. The identity matrix will be returned.
  _reproject(
/env/lib/python3.10/site-packages/rasterio/warp.py:344: NotGeoreferencedWarning: Dataset has no geotransform, gcps, or rpcs. The identity matrix will be returned.
  _reproject(

Create a table of cloud coverage percentage for each date¶

This table displays data for each time slice in the cube (indexed from 0). The "Clean_percent" is the percentage of pixels WITHOUT clouds, so low numbers indicate cloudy scenes and high numbers indicate clearer scenes. The "Clean_count" is the number of clear pixels in the scene and the "Total_count" is the number of valid (non-missing) pixels in the scene.

Typically, there is a separation of 5 days between Sentinel-2 scenes for a single location. This considers the availability of two missions (A and B) which is the case for most places in the world. When there is significant cloud cover, scenes may be missing from time series due to issues with processing and geolocation.

In [11]:
# Show every row (one per acquisition date) instead of pandas' truncated view
pd.set_option('display.max_rows', len(coverage_table))
coverage_table
Out[11]:
Date Clean_percent Cloud_percent NoData_percent Clean_count Total_count
0 2018-01-01 97.832759 2.167241 0.000000 967746 989184
1 2018-01-06 43.218249 56.781751 0.000000 427508 989184
2 2018-01-11 0.000202 17.151410 82.848388 2 169661
3 2018-01-16 0.000000 17.151612 82.848388 0 169661
4 2018-01-21 98.990077 1.009923 0.000000 979194 989184
5 2018-01-26 98.729357 1.270643 0.000000 976615 989184
6 2018-01-31 97.376626 2.623374 0.000000 963234 989184
7 2018-02-05 97.557178 2.442822 0.000000 965020 989184
8 2018-02-15 0.000000 17.151612 82.848388 0 169661
9 2018-02-20 0.000000 100.000000 0.000000 0 989184
10 2018-02-25 7.855263 92.144737 0.000000 77703 989184
11 2018-03-02 68.838558 31.161442 0.000000 680940 989184
12 2018-03-22 17.080846 0.070765 82.848388 168961 169661
13 2018-03-27 0.000000 17.151612 82.848388 0 169661
14 2018-04-01 0.025374 17.126237 82.848388 251 169661
15 2018-04-06 97.599638 2.400362 0.000000 965440 989184
16 2018-04-11 95.933011 4.066989 0.000000 948954 989184
17 2018-04-16 0.028205 17.123407 82.848388 279 169661
18 2018-04-21 97.228423 2.771577 0.000000 961768 989184
19 2018-04-26 0.000000 100.000000 0.000000 0 989184
20 2018-05-01 98.966320 1.033680 0.000000 978959 989184
21 2018-05-11 98.773939 1.226061 0.000000 977056 989184
22 2018-05-16 8.271767 8.879844 82.848388 81823 169661
23 2018-05-21 0.000000 100.000000 0.000000 0 989184
24 2018-05-26 54.824279 45.175721 0.000000 542313 989184
25 2018-06-05 97.263603 2.735992 0.000404 962116 989180
26 2018-06-10 65.417354 34.582646 0.000000 647098 989184
27 2018-06-15 50.671463 49.328537 0.000000 501234 989184
28 2018-06-20 88.229490 11.770510 0.000000 872752 989184
29 2018-06-25 31.816224 68.183776 0.000000 314721 989184
30 2018-06-30 75.658523 24.341477 0.000000 748402 989184
31 2018-07-05 24.674479 75.325521 0.000000 244076 989184
32 2018-07-10 98.589140 1.410860 0.000000 975228 989184
33 2018-07-15 0.000000 17.151612 82.848388 0 169661
34 2018-07-20 10.446085 89.553915 0.000000 103331 989184
35 2018-07-30 0.054692 17.095707 82.849601 541 169649
36 2018-08-04 97.549495 2.450505 0.000000 964944 989184
37 2018-08-09 85.186073 14.813927 0.000000 842647 989184
38 2018-08-14 99.600378 0.399622 0.000000 985231 989184
39 2018-08-19 81.700068 18.299932 0.000000 808164 989184
40 2018-08-24 66.693456 33.306544 0.000000 659721 989184
41 2018-08-29 99.505552 0.494448 0.000000 984293 989184
42 2018-09-03 48.792540 51.207460 0.000000 482648 989184
43 2018-09-08 7.830292 92.169708 0.000000 77456 989184
44 2018-09-18 70.011949 29.988051 0.000000 692547 989184
45 2018-09-23 0.250914 16.900698 82.848388 2482 169661
46 2018-09-28 2.509948 14.641664 82.848388 24828 169661
47 2018-10-03 99.622113 0.377887 0.000000 985446 989184
48 2018-10-08 29.301323 70.698677 0.000000 289844 989184
49 2018-10-13 98.290510 1.709490 0.000000 972274 989184
50 2018-10-18 98.718439 1.281561 0.000000 976507 989184
51 2018-10-23 97.836702 2.163298 0.000000 967785 989184
52 2018-10-28 97.452446 2.547554 0.000000 963984 989184
53 2018-11-07 98.867450 1.132550 0.000000 977981 989184
54 2018-11-17 94.872137 5.127863 0.000000 938460 989184
55 2018-11-22 99.026875 0.973125 0.000000 979558 989184
56 2018-11-27 98.542435 1.457565 0.000000 974766 989184
57 2018-12-07 97.328909 2.671091 0.000000 962762 989184
58 2018-12-12 92.647475 7.352525 0.000000 916454 989184
59 2018-12-17 99.269095 0.730905 0.000000 981954 989184
60 2018-12-22 7.409339 92.590661 0.000000 73292 989184
61 2018-12-27 76.611732 23.388268 0.000000 757831 989184

Create a plot of cloud coverage percentage for each date¶

In [12]:
# Plot the clean-pixel percentage for each acquisition date.
# High markers = clear scenes; low markers = cloudy scenes.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(coverage_table["Date"].values, coverage_table["Clean_percent"].values, 'bo', markersize=8)
ax.set_ylim([0, 105])  # small headroom above 100% so top markers are not clipped
ax.set_title('Sentinel-2 clean pixel percentage per scene')
ax.set_xlabel('Date')
ax.set_ylabel('Clean percent (%)')
plt.show()

Review an RGB scene for a selected time slice¶

In [13]:
# Load the data to create an RGB image
# NOTE(review): this rebinds `sentinel_dataset`, replacing the green+SCL cube
# loaded earlier with a six-band cube; `like=` reuses the earlier cube's grid
# and extents so the time indices in the coverage table still line up.
sentinel_dataset = dc.load(like=sentinel_dataset,
                           product='s2_l2a',
                           measurements = ['red', 'green', 'blue', 'nir', 'swir_1', 'swir_2'],
                           dask_chunks={'time':1})
In [14]:
# Select one of the time slices and create an RGB image. 
# Time slices are numbered from 0 to x and shown in the table above
# Review the clean_percentage values above to select scenes with few clouds
# Clouds will be visible in WHITE for an RGB image

# RGB image options
# True-Color RGB = Red, Green, Blue
# False Color RGB (Mosaic) = SWIR2, NIR, Green

# Index of the time slice to render (0-based; pick a high Clean_percent row
# from the coverage table). Renamed from `slice`, which shadowed the builtin.
scene_index = 0

fig, ax = plt.subplots(1, 2, figsize=(16, 8))
true_rgb = sentinel_dataset.isel(time=scene_index)[['red', 'green', 'blue']].to_array()
false_rgb = sentinel_dataset.isel(time=scene_index)[['swir_2', 'nir', 'green']].to_array()
true_rgb.plot.imshow(ax=ax[0], vmin=0, vmax=2000)
false_rgb.plot.imshow(ax=ax[1], vmin=0, vmax=5000)
# Title each panel and hide the axis ticks (map coordinates add no value here).
for axis, title in zip(ax, ('True Color', 'False Color')):
    axis.set_title(title)
    axis.xaxis.set_visible(False)
    axis.yaxis.set_visible(False)
plt.show()
In [ ]: